This data set was made available by Prosper.
# Build the analysis subset `loans2`.
# Step 1: discard loans whose employment status is blank, "Not available"
# or "Other".
loans2 <- subset(
  loans,
  !(EmploymentStatus %in% c("", "Not available", "Other"))
)
# Step 2: keep only rows whose stated monthly income lies strictly between
# the 1st and 99th percentiles (computed on the step-1 result), which trims
# roughly the bottom and top 1% of earners.
loans2 <- subset(
  loans2,
  StatedMonthlyIncome > quantile(StatedMonthlyIncome, 0.01) &
    StatedMonthlyIncome < quantile(StatedMonthlyIncome, 0.99)
)
# Mean and median original loan amount, summarised once per income range
# and once per loan term.
loan.loan_by_income <- loans2 %>%
  group_by(IncomeRange) %>%
  summarise(
    mean_loan_amount = mean(LoanOriginalAmount),
    median_loan_amount = median(LoanOriginalAmount)
  )

loan.loan_by_term <- loans2 %>%
  group_by(Term) %>%
  summarise(
    mean_loan_amount = mean(LoanOriginalAmount),
    median_loan_amount = median(LoanOriginalAmount)
  )

# Reshape both summaries to long format, keeping the grouping column as the
# identifier; the plots further down read the resulting `variable` and
# `value` columns.
mdata <- melt(loan.loan_by_income, id.vars = "IncomeRange")
mdata2 <- melt(loan.loan_by_term, id.vars = "Term")
# Number of loans per employment status, with each bar's count printed just
# above it.
ggplot(loans2, aes(x = EmploymentStatus)) +
  geom_bar() +
  # after_stat(count) replaces the deprecated `..count..` notation for
  # referring to the count computed by the "count" stat.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5)
# Distribution of original loan amounts.
# bins = 30 is the value stat_bin() was already falling back to; stating it
# explicitly keeps the plot unchanged and stops the "Pick better value with
# `binwidth`" message from being emitted on every render.
ggplot(loans2, aes(x = LoanOriginalAmount)) +
  geom_histogram(bins = 30)
# Distribution of original loan amounts, restricted to loans whose status
# is "Defaulted".
# bins = 30 is the value stat_bin() was already falling back to; stating it
# explicitly keeps the plot unchanged and stops the "Pick better value with
# `binwidth`" message from being emitted on every render.
ggplot(subset(loans2, LoanStatus == "Defaulted"),
       aes(x = LoanOriginalAmount)) +
  geom_histogram(bins = 30)
# Number of defaulted loans per income range, with the x axis in the order
# given by `limits`.
# geom_bar() counts rows per category by default. The previous
# geom_histogram(stat = "count") drew the same bars but passed binning
# parameters (binwidth, bins, pad) that the count stat ignores, producing an
# "Ignoring unknown parameters" warning.
# Income ranges not listed in `limits` are dropped from the plot; the
# original rendering reported 21 rows removed for that reason.
ggplot(subset(loans2, LoanStatus == "Defaulted"), aes(x = IncomeRange)) +
  geom_bar() +
  scale_x_discrete(
    limits = c(
      "Not employed", "$1-24,999", "$25,000-49,999", "$50,000-74,999",
      "$75,000-99,999", "$100,000+"
    )
  )
# Mean and median loan amount per income range, drawn as side-by-side bars
# (one bar per statistic held in `variable`).
# geom_col() is the dedicated geom for bar heights taken directly from the
# data and is equivalent to geom_bar(stat = "identity").
# Income ranges not listed in `limits` are dropped from the plot; the
# original rendering reported 4 rows removed for that reason.
ggplot(mdata, aes(x = IncomeRange, y = value, fill = variable)) +
  geom_col(position = "dodge") +
  scale_x_discrete(
    limits = c(
      "Not employed", "$1-24,999", "$25,000-49,999", "$50,000-74,999",
      "$75,000-99,999", "$100,000+"
    )
  )
# Mean and median loan amount per loan term, drawn as side-by-side bars
# (one bar per statistic held in `variable`).
# geom_col() is the dedicated geom for bar heights taken directly from the
# data and is equivalent to geom_bar(stat = "identity").
ggplot(mdata2, aes(x = Term, y = value, fill = variable)) +
  geom_col(position = "dodge")
# Number of loans in each income range.
ggplot(data = loans2, mapping = aes(x = IncomeRange)) +
  geom_bar()
# Original loan amount against stated monthly income. Points are jittered
# and drawn nearly transparent (alpha = 0.05).
ggplot(
  data = loans2,
  mapping = aes(x = StatedMonthlyIncome, y = LoanOriginalAmount)
) +
  geom_point(position = "jitter", alpha = 0.05)

# Same scatter with points coloured by income range; no transparency is
# applied here.
ggplot(
  data = loans2,
  mapping = aes(
    x = StatedMonthlyIncome,
    y = LoanOriginalAmount,
    color = IncomeRange
  )
) +
  geom_point(position = "jitter")

# Same scatter split into one panel per loan term (alpha = 0.3).
ggplot(
  data = loans2,
  mapping = aes(x = StatedMonthlyIncome, y = LoanOriginalAmount)
) +
  geom_point(position = "jitter", alpha = 0.3) +
  facet_wrap(~Term)
# Three scatter plots against the lower bound of the credit score range,
# each overlaid with a red smoothed trend line.
# With method = "auto" the original rendering reported that geom_smooth()
# chose method = 'gam' with formula 'y ~ s(x, bs = "cs")' for all three.

# Original loan amount by credit score.
ggplot(
  data = loans2,
  mapping = aes(x = CreditScoreRangeLower, y = LoanOriginalAmount)
) +
  geom_point(alpha = 0.10) +
  geom_smooth(method = "auto", color = "red")

# Borrower rate by credit score.
ggplot(
  data = loans2,
  mapping = aes(x = CreditScoreRangeLower, y = BorrowerRate)
) +
  geom_point(alpha = 0.10) +
  geom_smooth(method = "auto", color = "red")

# Borrower APR by credit score.
ggplot(
  data = loans2,
  mapping = aes(x = CreditScoreRangeLower, y = BorrowerAPR)
) +
  geom_point(alpha = 0.10) +
  geom_smooth(method = "auto", color = "red")
# Pairwise scatter plots of borrower APR, lender yield and estimated loss,
# all drawn with alpha = 0.10.

# Lender yield against borrower APR.
ggplot(data = loans2, mapping = aes(x = BorrowerAPR, y = LenderYield)) +
  geom_point(alpha = 0.10)

# Estimated loss against borrower APR.
# The original rendering reported 20665 rows dropped for missing values.
ggplot(data = loans2, mapping = aes(x = BorrowerAPR, y = EstimatedLoss)) +
  geom_point(alpha = 0.10)

# Estimated loss against lender yield.
# The original rendering reported 20665 rows dropped for missing values.
ggplot(data = loans2, mapping = aes(x = LenderYield, y = EstimatedLoss)) +
  geom_point(alpha = 0.10)